### ---- SCRIPT 04: The purpose of this script is to provide descriptive statistics for the 825 unique guests ---- ###
### ---- who have appeared on any of the seven selected programmes at any point. This includes summarising the ---- ###
### ---- breakdown of guest category and sub-category types, nationality, and distribution of ideal points by ---- ###
### ---- category type. It also summarises the Twitter profile metadata of guests and wider user groups.      ---- ###

# A large scipen penalty biases R towards printing numbers in fixed rather than
# scientific notation
options(scipen = 999)

#### ---- LIBRARIES ---- ####

library(dplyr) # data wrangling
library(ggplot2) # data visualisation
library(data.table) # big data
library(treemapify) # generate treemap plots
library(ggridges) # generate ridge plots 

#### ---- DIRECTORIES ---- ####

# Relative folder paths; each ends in "/" so that file names can be appended
# directly with paste0()
raw_dir       <- "00-raw_data/"       # read from: guest lists
processed_dir <- "01-processed_data/" # read from: guest master list, user data
figure_dir    <- "02-figures/"        # written to: saved plots

#### ---- LOAD DATA ---- ####

# Guest master list (one table; carries the user_ideal_point column used below)
guest_data <- fread(paste0(processed_dir, "username_master_list_with_ideal_points.csv"))

# TV guest lists: every file found in the guest_lists folder is read into its
# own table, and the tables are kept together in a list
guest_lists <- lapply(
  list.files(paste0(raw_dir, "guest_lists/"), full.names = TRUE),
  fread
)

# Twitter user data
user_data <- fread(paste0(processed_dir, "user_ideal_point_data_updated.csv"))

#### ---- SUMMARISE TWITTER PROFILE METADATA ---- ####

# Stack three sets of rows so that one grouped summary covers every group in a
# single table:
#   1. the user data as loaded, under whatever elite_account values it carries
#   2. a copy of the same users relabelled "Overall"
#   3. the guests that have an ideal point, relabelled "Guest"

user_data_dupe <- user_data
user_data_dupe$elite_account <- "Overall"

guest_data_dupe <- guest_data %>% filter(!is.na(user_ideal_point))
guest_data_dupe$elite_account <- "Guest"

# fill = TRUE pads any column missing from one of the tables with NA
user_data_dupe <- rbind(user_data, user_data_dupe, guest_data_dupe, fill = TRUE)

user_data_summary <- user_data_dupe %>%
  group_by(elite_account) %>%
  summarise(
    count = n(),
    # Share of verified accounts per group. Rows with a missing `verified`
    # value are dropped from the numerator but still counted in the
    # denominator (`count` is the group's row count defined just above).
    verified = round(sum(verified, na.rm = TRUE) / count, 3),
    median_mps_followed = median(mps_followed, na.rm = TRUE),
    median_followers = median(followers_count, na.rm = TRUE),
    median_following = median(following_count, na.rm = TRUE),
    median_tweets = median(tweet_count, na.rm = TRUE),
    median_listed = median(listed_count, na.rm = TRUE),
    median_ideal = median(user_ideal_point, na.rm = TRUE)
  )

#### ---- SUMMARISE GUEST CATEGORIES ---- ####

# Firstly, count the guests in each main category type
guest_catergories_summary <- guest_data %>%
  group_by(category_type) %>%
  summarise(count = n())

# Secondly, count the guests in each subcategory within each main category
# type. With two grouping columns summarise() would otherwise return a table
# still grouped by category_type (and print a message saying so);
# .groups = "drop" returns a plain ungrouped table instead.
guest_subcatergories_summary <- guest_data %>%
  group_by(category_type, category_name_1) %>%
  summarise(count = n(), .groups = "drop")

# Plot as a treemap: tile area is the guest count, tiles are labelled with the
# subcategory name, and are grouped and coloured by main category type
subcategory_treemap <- ggplot(
  guest_subcatergories_summary,
  aes(
    area = count,
    label = category_name_1,
    fill = category_type,
    subgroup = category_type
  )
) +
  geom_treemap() +
  geom_treemap_subgroup_border() +
  # Semi-transparent category name drawn across each subgroup
  geom_treemap_subgroup_text(
    place = "centre", grow = TRUE, alpha = 0.5, colour = "black",
    fontface = "italic", min.size = 0
  ) +
  # Subcategory name in the top-left corner of each tile
  geom_treemap_text(colour = "white", place = "topleft", reflow = TRUE) +
  # "none" is the documented value for hiding the legend (an empty string is
  # not one of the documented positions)
  theme(legend.position = "none")

ggsave(
  paste0(figure_dir, "guest_sub_categories_treemap.png"),
  subcategory_treemap,
  units = "in", width = 7, height = 4, dpi = 300,
  bg = "white"
)

# Thirdly, count the guests per organisation
guest_organisation_summary <- summarise(
  group_by(guest_data, organisation),
  count = n()
)

# Finally, count the guests per nationality type
guest_nationality_summary <- summarise(
  group_by(guest_data, nationality_type),
  count = n()
)

#### ---- GUEST CATEGORY IDEAL POINT DISTRIBUTIONS ---- ####

# User ideal points by category type

# Order categories by median ideal point (highest median first) for benefit of
# plot interpretability. Missing ideal points are ignored when taking the median.
category_ideal_point_plot_ordering <- guest_data %>%
  group_by(category_type) %>%
  summarise(median_ideal_point = median(user_ideal_point, na.rm = TRUE)) %>%
  arrange(desc(median_ideal_point)) %>%
  pull(category_type)

# Store that ordering as the factor levels of category_type so that anything
# plotted by category follows it
guest_data$category_type <- factor(guest_data$category_type, levels = category_ideal_point_plot_ordering)

# Raincloud distribution plot: for each category, a density ridge of the ideal
# points with a slim boxplot drawn over it

ideal_point_distributions_by_cat <- ggplot(
  guest_data,
  aes(x = user_ideal_point, y = category_type, fill = category_type)
) +
  geom_density_ridges(scale = 1, alpha = 0.7, bandwidth = 0.1, rel_min_height = 0.01) +
  # outlier.shape = NA hides the boxplot's outlier points
  geom_boxplot(width = 0.15, outlier.shape = NA, alpha = 0.7) +
  theme_minimal() +
  labs(y = "", x = "Ideal Point") +
  theme(axis.title.y = element_blank(), legend.position = "none") +
  # Dashed vertical reference line at an ideal point of zero
  geom_vline(xintercept = 0, linetype = "dashed", color = "black")

ggsave(
  filename = paste0(figure_dir, "ideal_point_by_category_raincloud_plot.png"),
  plot = ideal_point_distributions_by_cat,
  units = "in", width = 7, height = 4, dpi = 300,
  bg = "white"
)

#### ---- END ---- ####
